import sys, os
sys.path.append(os.path.abspath(".."))Clustering Execution & Optimization
Resumen
Optimización de hiperparámetros y ejecución de algoritmos de clustering sobre espacios MOFA y UMAP.
Añadir al sistema la ruta al directorio base para importar módulos personalizados.
Importar librerías y módulos necesarios.
import pandas as pd
import src.data_utils as du
import src.clustering_utils as cu
import src.plots as p
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as npImportar datos procesados de MOFA y UMAP.
# Base directory for the data files, given relative to the working directory.
DATA_DIR = "../data/"
# MOFA factors
m_factors = du.load_data(DATA_DIR + "MOFA_dir/M_factors", index_col=0)
# UMAP features
m_umap = du.load_data(DATA_DIR + "processed_data/M_umap", index_col=0)Ejecución de K-Means
Datos MOFA
Selección de hiperparámetros óptimos
# Run the K-Means optimisation helper on the MOFA factors for k = 2..11.
mofa_kmeans_res = cu.run_kmeans_optimization(m_factors, k_range=range(2, 12))
# k reported as optimal by the helper (the selection criterion lives in
# cu.run_kmeans_optimization and is not visible here).
opt_k_mofa = mofa_kmeans_res["optimal_k"]
print(f"MOFA Optimal k detected: {opt_k_mofa}")
# Plot inertia and silhouette per k; opt_k_mofa is passed as the chosen k
# (presumably highlighted on the plot — confirm in src.plots).
p.plot_elbow_silhouette(
mofa_kmeans_res["k_range"],
mofa_kmeans_res["inertias"],
mofa_kmeans_res["silhouettes"],
opt_k_mofa,
"MOFA Clustering",
)MOFA Optimal k detected: 7
Matriz de proximidad de los clusters
# k used for the final K-Means labelling of the MOFA factors. Named here,
# like selected_k_umap and k_gmm_mofa elsewhere in this file, instead of
# being buried as a bare literal in the lookup.
# NOTE(review): the elbow/silhouette plot is drawn with opt_k_mofa, not with
# this value — confirm that 8 is the intended manual choice.
selected_k_mofa = 8
labels_mofa = mofa_kmeans_res["results"][selected_k_mofa]["labels"]
# Attach the labels and sort rows by cluster so that samples of the same
# cluster sit next to each other in the heatmap.
m_factors_ordered = m_factors.copy()
m_factors_ordered["Cluster"] = labels_mofa
m_factors_ordered = m_factors_ordered.sort_values("Cluster")
# The proximity matrix is computed on the features only (label column dropped).
dist_matrix = cu.get_proximity_matrix(m_factors_ordered.drop(columns="Cluster"))
plt.figure(figsize=(8, 6))
sns.heatmap(dist_matrix, cmap="viridis", xticklabels=False, yticklabels=False)
plt.title("Proximity Matrix (Sorted by K-Means MOFA Cluster)")
plt.show()Datos UMAP
Selección de hiperparámetros óptimos
# Run the K-Means optimisation helper on the UMAP features for k = 2..11.
umap_kmeans_res = cu.run_kmeans_optimization(m_umap, k_range=range(2, 12))
# k reported as optimal by the helper.
opt_k_umap = umap_kmeans_res["optimal_k"]
# Hard-coded k for the UMAP space; it is passed to the plot below in place
# of the detected opt_k_umap.
selected_k_umap = 8
print(f"UMAP Optimal k detected: {opt_k_umap}")
# Plot inertia and silhouette per k, passing the hard-coded selection.
p.plot_elbow_silhouette(
umap_kmeans_res["k_range"],
umap_kmeans_res["inertias"],
umap_kmeans_res["silhouettes"],
selected_k_umap,
"UMAP Clustering",
)UMAP Optimal k detected: 6
Matriz de proximidad de los clusters
# Use the k already chosen for the UMAP elbow/silhouette plot rather than
# repeating the literal 8, so the plotted k and the labels cannot drift apart.
labels_umap = umap_kmeans_res["results"][selected_k_umap]["labels"]
# Attach the labels and sort rows by cluster so that samples of the same
# cluster sit next to each other in the heatmap.
m_umap_ordered = m_umap.copy()
m_umap_ordered["Cluster"] = labels_umap
m_umap_ordered = m_umap_ordered.sort_values("Cluster")
# The proximity matrix is computed on the features only (label column dropped).
dist_matrix = cu.get_proximity_matrix(m_umap_ordered.drop(columns="Cluster"))
plt.figure(figsize=(8, 6))
sns.heatmap(dist_matrix, cmap="viridis", xticklabels=False, yticklabels=False)
plt.title("Proximity Matrix (Sorted by K-Means UMAP Cluster)")
plt.show()Ejecución de GMM
Datos MOFA
Selección de hiperparámetros óptimos
# Run the GMM optimisation helper on the MOFA factors for k = 2..11.
gmm_res = cu.optimize_gmm(m_factors, k_range=range(2, 12))
# Plot the BIC, AIC and silhouette values returned for each k.
p.plot_gmm_optimization(
gmm_res["k_range"], gmm_res["bics"], gmm_res["aics"], gmm_res["silhouettes"]
)
# k with the lowest BIC: np.argmin gives the position, k_range maps it to k.
best_k_gmm = gmm_res["k_range"][np.argmin(gmm_res["bics"])]
print(f"Best K based on BIC: {best_k_gmm}")
k_gmm_mofa = 8 # Manually selected
# The labels are taken for the manually selected k, not for best_k_gmm.
labels_gmm_opt = gmm_res["results"][k_gmm_mofa]["labels"]Best K based on BIC: 9
Matriz de proximidad de los clusters
# Label each sample with its GMM cluster and order the rows by cluster, so
# samples of the same cluster are adjacent in the heatmap.
m_factors_ordered_gmm = m_factors.assign(Cluster=labels_gmm_opt).sort_values(
    by="Cluster"
)
# Proximity is computed on the features only; the label column is dropped.
dist_matrix_gmm = cu.get_proximity_matrix(
    m_factors_ordered_gmm.drop(columns=["Cluster"])
)
plt.figure(figsize=(8, 6))
sns.heatmap(
    dist_matrix_gmm,
    cmap="viridis",
    xticklabels=False,
    yticklabels=False,
)
plt.title("Proximity Matrix (Sorted by GMM MOFA Cluster)")
plt.show()Datos UMAP
Selección de hiperparámetros óptimos
# Run the GMM optimisation helper on the UMAP features for k = 2..11.
gmm_umap_res = cu.optimize_gmm(m_umap, k_range=range(2, 12))
# Plot the BIC, AIC and silhouette values returned for each k.
p.plot_gmm_optimization(
gmm_umap_res["k_range"], gmm_umap_res["bics"], gmm_umap_res["aics"], gmm_umap_res["silhouettes"]
)
# k with the lowest BIC: np.argmin gives the position, k_range maps it to k.
best_k_gmm_umap = gmm_umap_res["k_range"][np.argmin(gmm_umap_res["bics"])]
print(f"Best K based on BIC: {best_k_gmm_umap}")
k_gmm_umap = 8 # Manually selected
# The labels are taken for the manually selected k, not for best_k_gmm_umap.
labels_gmm_umap_opt = gmm_umap_res["results"][k_gmm_umap]["labels"]Best K based on BIC: 8
Matriz de proximidad de los clusters
# Label each sample with its GMM cluster and order the rows by cluster, so
# samples of the same cluster are adjacent in the heatmap.
m_umap_ordered_gmm = m_umap.assign(Cluster=labels_gmm_umap_opt).sort_values(
    by="Cluster"
)
# Proximity is computed on the features only; the label column is dropped.
dist_matrix_gmm_umap = cu.get_proximity_matrix(
    m_umap_ordered_gmm.drop(columns=["Cluster"])
)
plt.figure(figsize=(8, 6))
sns.heatmap(
    dist_matrix_gmm_umap,
    cmap="viridis",
    xticklabels=False,
    yticklabels=False,
)
plt.title("Proximity Matrix (Sorted by GMM UMAP Cluster)")
plt.show()Ejecución de DBSCAN
Datos MOFA
Selección de hiperparámetros óptimos
# Grid of DBSCAN parameters: eps from 0.1 up to (but excluding) 2.0 in steps
# of 0.1, combined with three min_samples values.
eps_candidates = np.arange(0.1, 2.0, 0.1)
min_samples_candidates = [3, 5, 10]
dbscan_table = cu.grid_search_dbscan(m_factors, eps_candidates, min_samples_candidates)
# Keep only configurations with more than one cluster and a noise ratio
# below 1, ordered from highest to lowest silhouette.
dbscan_table_filtered = dbscan_table[
(dbscan_table["n_clusters"] > 1) & (dbscan_table["noise_ratio"] < 1)
].sort_values("silhouette", ascending=False)
# Show the ten best rows (display is the notebook built-in).
display(dbscan_table_filtered.head(10))| eps | min_samples | n_clusters | n_noise | noise_ratio | silhouette | labels | |
|---|---|---|---|---|---|---|---|
| 39 | 1.4 | 3 | 2 | 153 | 0.962 | -0.154 | [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -... |
| 42 | 1.5 | 3 | 2 | 153 | 0.962 | -0.154 | [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -... |
| 54 | 1.9 | 3 | 9 | 120 | 0.755 | -0.181 | [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -... |
| 45 | 1.6 | 3 | 4 | 146 | 0.918 | -0.207 | [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -... |
| 48 | 1.7 | 3 | 6 | 138 | 0.868 | -0.224 | [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -... |
| 51 | 1.8 | 3 | 8 | 129 | 0.811 | -0.237 | [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -... |
# Neighbour count for the k-distance curve; same value as the min_samples=5
# passed to cu.run_dbscan below.
k_neighbors = 5
distances = cu.calculate_k_distance(m_factors, k=k_neighbors)
# NOTE(review): the next line fuses two statements (the k-distance plot and
# the final DBSCAN run on the MOFA factors) with a section heading.
# eps=4 lies outside the eps grid searched earlier (0.1 to below 2.0) —
# presumably read off the k-distance curve; confirm.
p.plot_k_distance_curve(distances, k=k_neighbors)labels_dbscan_opt = cu.run_dbscan(m_factors, eps=4, min_samples=5)["labels"]Matriz de proximidad de los clusters
# Label each sample with its DBSCAN cluster and order the rows by cluster, so
# samples sharing a label are adjacent in the heatmap.
m_factors_ordered_dbscan = m_factors.assign(Cluster=labels_dbscan_opt).sort_values(
    by="Cluster"
)
# Proximity is computed on the features only; the label column is dropped.
dist_matrix_dbscan = cu.get_proximity_matrix(
    m_factors_ordered_dbscan.drop(columns=["Cluster"])
)
plt.figure(figsize=(8, 6))
sns.heatmap(
    dist_matrix_dbscan,
    cmap="viridis",
    xticklabels=False,
    yticklabels=False,
)
plt.title("Proximity Matrix (Sorted by DBSCAN MOFA Cluster)")
plt.show()Datos UMAP
Selección de hiperparámetros óptimos
# Grid of DBSCAN parameters: eps from 0.1 up to (but excluding) 2.0 in steps
# of 0.1, combined with three min_samples values.
eps_candidates = np.arange(0.1, 2.0, 0.1)
min_samples_candidates = [3, 5, 10]
dbscan_umap_table = cu.grid_search_dbscan(m_umap, eps_candidates, min_samples_candidates)
# Keep only configurations with more than one cluster and a noise ratio
# below 1, ordered from highest to lowest silhouette.
dbscan_umap_table_filtered = dbscan_umap_table[
(dbscan_umap_table["n_clusters"] > 1) & (dbscan_umap_table["noise_ratio"] < 1)
].sort_values("silhouette", ascending=False)
# Show the ten best rows (display is the notebook built-in).
display(dbscan_umap_table_filtered.head(10))| eps | min_samples | n_clusters | n_noise | noise_ratio | silhouette | labels | |
|---|---|---|---|---|---|---|---|
| 22 | 0.8 | 5 | 6 | 0 | 0.000 | 0.608 | [0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, ... |
| 18 | 0.7 | 3 | 6 | 0 | 0.000 | 0.608 | [0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, ... |
| 23 | 0.8 | 10 | 7 | 1 | 0.006 | 0.594 | [0, 0, 2, 0, 0, 2, 2, 1, 2, 2, 1, 1, 0, 2, 3, ... |
| 20 | 0.7 | 10 | 9 | 8 | 0.050 | 0.590 | [-1, 0, 2, 0, 0, 2, 2, 1, 2, 2, 1, 1, 3, 2, 4,... |
| 21 | 0.8 | 3 | 5 | 0 | 0.000 | 0.580 | [0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, ... |
| 19 | 0.7 | 5 | 6 | 1 | 0.006 | 0.570 | [0, 0, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 0, 1, 1, ... |
| 15 | 0.6 | 3 | 7 | 0 | 0.000 | 0.554 | [0, 0, 1, 2, 0, 1, 1, 3, 1, 1, 3, 3, 2, 1, 1, ... |
| 13 | 0.5 | 5 | 10 | 7 | 0.044 | 0.545 | [0, 0, 1, 3, 0, 1, 1, 2, 1, -1, 2, 2, 3, 6, 4,... |
| 25 | 0.9 | 5 | 5 | 0 | 0.000 | 0.544 | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, ... |
| 29 | 1.0 | 10 | 5 | 0 | 0.000 | 0.544 | [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, ... |
# Neighbour count for the k-distance curve; same value as the min_samples=5
# passed to cu.run_dbscan below.
k_neighbors = 5
distances_umap = cu.calculate_k_distance(m_umap, k=k_neighbors)
# NOTE(review): the next line fuses two statements (the k-distance plot and
# the final DBSCAN run on the UMAP features) with a section heading.
# eps=0.4 is not among the ten best-silhouette rows in the recorded table
# output — presumably read off the k-distance curve; confirm.
p.plot_k_distance_curve(distances_umap, k=k_neighbors)labels_dbscan_umap_opt = cu.run_dbscan(m_umap, eps=0.4, min_samples=5)["labels"]Matriz de proximidad de los clusters
# Label each sample with its DBSCAN cluster and order the rows by cluster, so
# samples sharing a label are adjacent in the heatmap.
m_umap_ordered_dbscan = m_umap.assign(Cluster=labels_dbscan_umap_opt).sort_values(
    by="Cluster"
)
# Proximity is computed on the features only; the label column is dropped.
dist_matrix_dbscan_umap = cu.get_proximity_matrix(
    m_umap_ordered_dbscan.drop(columns=["Cluster"])
)
plt.figure(figsize=(8, 6))
sns.heatmap(
    dist_matrix_dbscan_umap,
    cmap="viridis",
    xticklabels=False,
    yticklabels=False,
)
plt.title("Proximity Matrix (Sorted by DBSCAN UMAP Cluster)")
plt.show()Guardar mejores modelos de clustering
# AVAILABLE OPTIONS (label variables defined earlier in this file):
#   K-Means: labels_mofa / labels_umap
#   GMM:     labels_gmm_opt / labels_gmm_umap_opt
#   DBSCAN:  labels_dbscan_opt / labels_dbscan_umap_opt
CLUSTERING_DIR = DATA_DIR + "clustering_dir/"
# Backward-compatible alias for the original, misspelled name.
CLUTERING_DIR = CLUSTERING_DIR
# Label sets to persist, keyed by the name used in the output file.
selected_models = {
    "GMM_Opt": labels_gmm_opt,
    "GMM_UMAP_Opt": labels_gmm_umap_opt,
}
# Sample index of the frame each label set was computed on. The labels are
# assigned column-wise to their source frame earlier in this file, so they
# are paired with that frame's index here rather than always with
# m_factors.index. Models not listed fall back to m_factors.index, which
# was the original behaviour for every model.
label_source_index = {
    "GMM_Opt": m_factors.index,
    "GMM_UMAP_Opt": m_umap.index,
}
for model_name, labels in selected_models.items():
    sample_index = label_source_index.get(model_name, m_factors.index)
    # One-column frame ("Cluster") indexed by sample, saved per model.
    df_selected = pd.DataFrame(labels, index=sample_index, columns=["Cluster"])
    du.save_data(df_selected, CLUSTERING_DIR + f"selected_clusters_{model_name}")